{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HTML解析（parse HTML）及Xpath实践练习\n",
    "- week05 作业\n",
    "- 复习本周学习内容——HTML解析（parse HTML）及Xpath实践。\n",
    "- 完成对广州南方学院五个部分（学校要闻、校园动态、通知公告、招投标、高教动态）的数据爬虫抓取后存入Excel中，将抓取代码文档以及Excel文档放入gitee/github，提交其链接。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import urllib.parse\n",
    "\n",
    "import pandas as pd\n",
    "import requests_html\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 学校要闻"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### html 页面数据的存与读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/xxyw/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_学校要闻week5.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_学校要闻week5.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "    html_load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### soup_parse 解析：str 的 html 文本 => html Element 元素对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x231044b5040>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "xxyw_parsed = requests_html.soup_parse(html_load)\n",
    "xxyw_parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析、重组链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/xxyw/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae92f7535a7d60c9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04becfaf48b3138e20.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b258adb1d07ca28b.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a2cdc278fc884ea.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d241e79365b6290.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5d2644acb6c552c.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88d3003f1cd99e59.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6ddf5569f73e5c1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7f4ae65395910ff.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9417b800a8b2e3f.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d080d50d1113be1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/59bda093ced440f78c638ade40ab0b93.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/1af5590575b74762b624f048b5ad79f4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/4e32521de0da4d21979182e1b114a964.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/23279088871e4b89b8eab2e7fbc77b17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/a5de3999469447b488857144f58f8c27.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0af15b9878f1d2c.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d8136478ec97e3635.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf76617654b5ca55.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/f28729353ff749b9b170825ffe346949.htm']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in xxyw_parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "91\n"
     ]
    }
   ],
   "source": [
    "# 有多少页？\n",
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/xxyw/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/index.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index19.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index20.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index21.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index22.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index23.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index24.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index25.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index26.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index27.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index28.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index29.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index30.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index31.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index32.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index33.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index34.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index35.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index36.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index37.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index38.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index39.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index40.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index41.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index42.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index43.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index44.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index45.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index46.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index47.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index48.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index49.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index50.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index51.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index52.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index53.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index54.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index55.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index56.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index57.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index58.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index59.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index60.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index61.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index62.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index63.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index64.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index65.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index66.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index67.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index68.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index69.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index70.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index71.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index72.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index73.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index74.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index75.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index76.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index77.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index78.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index79.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index80.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index81.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index82.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index83.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index84.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index85.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index86.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index87.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index88.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index89.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index90.htm']"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现翻页的url队列\n",
    "url_xxyw_group = ['https://www.nfu.edu.cn/xxyw/index'+str(i)+'.htm' for i in range(1,91)]\n",
    "url_xxyw_group.insert(0,'https://www.nfu.edu.cn/xxyw/index.htm')\n",
    "url_xxyw_group"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/xxyw/index.htm'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_xxyw_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_xxyw_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(xxyw_parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index35.htm', 'index36.htm', 'index37.htm', 'index38.htm', 'index39.htm', 'index4.htm', 'index40.htm', 'index41.htm', 'index42.htm', 'index43.htm', 'index44.htm', 'index45.htm', 'index46.htm', 'index47.htm', 'index48.htm', 'index49.htm', 'index5.htm', 'index50.htm', 'index51.htm', 'index52.htm', 'index53.htm', 'index54.htm', 'index55.htm', 'index56.htm', 'index57.htm', 'index58.htm', 'index59.htm', 'index6.htm', 'index60.htm', 'index61.htm', 'index62.htm', 'index63.htm', 'index64.htm', 'index65.htm', 'index66.htm', 'index67.htm', 'index68.htm', 'index69.htm', 'index7.htm', 'index70.htm', 'index71.htm', 'index72.htm', 'index73.htm', 'index74.htm', 'index75.htm', 'index76.htm', 'index77.htm', 'index78.htm', 'index79.htm', 'index8.htm', 'index80.htm', 'index81.htm', 'index82.htm', 'index83.htm', 'index84.htm', 'index85.htm', 'index86.htm', 'index87.htm', 'index88.htm', 'index89.htm', 'index9.htm', 'index90.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>我校召开高校教师职称评审 政策解读专题报告会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b2...</td>\n",
       "      <td>2021-04-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>快！来为我校大学生国旗护卫队参赛点赞！</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>我校承办首届 “新时代从商培养工程”</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>专注当下，冲刺高考，奋斗出最美的青春</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1777</th>\n",
       "      <td>17</td>\n",
       "      <td>首届从化地区学工部（处）联谊会在我院举行</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/5a530157f3764b32ad...</td>\n",
       "      <td>2013-09-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1779</th>\n",
       "      <td>19</td>\n",
       "      <td>学院教学工作会议顺利召开</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/b844901be7a6412eb7...</td>\n",
       "      <td>2013-09-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1800</th>\n",
       "      <td>0</td>\n",
       "      <td>我院召开新进教职工座谈会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/460879ee62c94531ba...</td>\n",
       "      <td>2013-09-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1801</th>\n",
       "      <td>1</td>\n",
       "      <td>我院2013级新生军训正式开始</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/9ae5ab09744e4d808a...</td>\n",
       "      <td>2013-09-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1802</th>\n",
       "      <td>2</td>\n",
       "      <td>我院2013级新生“安全法纪教育”讲座顺利举行</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/e8f0aa3bb74d43cbb0...</td>\n",
       "      <td>2013-09-17</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1803 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      index                               标题  \\\n",
       "2         2           我校召开高校教师职称评审 政策解读专题报告会   \n",
       "0         0              快！来为我校大学生国旗护卫队参赛点赞！   \n",
       "4         4               我校承办首届 “新时代从商培养工程”   \n",
       "1         1               专注当下，冲刺高考，奋斗出最美的青春   \n",
       "3         3  我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会   \n",
       "...     ...                              ...   \n",
       "1777     17             首届从化地区学工部（处）联谊会在我院举行   \n",
       "1779     19                     学院教学工作会议顺利召开   \n",
       "1800      0                     我院召开新进教职工座谈会   \n",
       "1801      1                  我院2013级新生军训正式开始   \n",
       "1802      2          我院2013级新生“安全法纪教育”讲座顺利举行   \n",
       "\n",
       "                                                     链接          日期  \n",
       "2     https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b2...  2021-04-10  \n",
       "0     https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...  2021-04-09  \n",
       "4     https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...  2021-04-02  \n",
       "1     https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...  2021-04-02  \n",
       "3     https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...  2021-04-02  \n",
       "...                                                 ...         ...  \n",
       "1777  https://www.nfu.edu.cn/xxyw/5a530157f3764b32ad...  2013-09-29  \n",
       "1779  https://www.nfu.edu.cn/xxyw/b844901be7a6412eb7...  2013-09-26  \n",
       "1800  https://www.nfu.edu.cn/xxyw/460879ee62c94531ba...  2013-09-25  \n",
       "1801  https://www.nfu.edu.cn/xxyw/9ae5ab09744e4d808a...  2013-09-17  \n",
       "1802  https://www.nfu.edu.cn/xxyw/e8f0aa3bb74d43cbb0...  2013-09-17  \n",
       "\n",
       "[1803 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/xxyw/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/xxyw/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_广州南方学院.xlsx',mode='a',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='学校要闻')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 校园动态"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### html页面数据存读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/xydt/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_校园动态week5.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_校园动态week5.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "    html_load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### soup_parse 解析：str 的 html 文本 => html Element 元素对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x2310494bc20>"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# HTML文本解析 \n",
    "xydt_parsed = requests_html.soup_parse(html_load)\n",
    "xydt_parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析、重组链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/xydt/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xydt/7dfe6fcd15fd495597cbd282de863733.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/09627d3243ee4578ac69be2881abd8b3.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/debea203b0c84a3092e6b5416cc4c2f1.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/9ec16bf90e164071b68a57332c5fe020.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/cf4420785b9046e99851413a1fb1b6f7.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebfe79cd5d66e288b.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/935f580040704990a4e396fa8091ee30.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/9611d110ec8a486587ab4020171ee9f5.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c4217dd13f8f8ec.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/a9523b72a34e4143afa9b38879ecba0c.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/f8434c9f092348c2a4a3c1ac8727a8fd.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/d60d33983337463390ef99385afce119.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/4b0c8e69b5074d28badd20cb55436009.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/bdda4dfbda944a3eb84612b7045620f4.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/96e05388e3fa43de9a8f446b083875f8.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/25dc7cb574284be18a6c9a0640e5aca3.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/b9777111c2194e7b85143431ab4706a7.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/2f3dcc0f4400419e8e42af09fde3c251.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/c3846031b4c0444a99e0dfd90047c046.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/5e165fdaee834899891ba5b3eea69bc9.htm']"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in xydt_parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85\n"
     ]
    }
   ],
   "source": [
    "# 有多少页？\n",
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/xydt/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xydt/index.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index19.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index20.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index21.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index22.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index23.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index24.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index25.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index26.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index27.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index28.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index29.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index30.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index31.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index32.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index33.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index34.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index35.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index36.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index37.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index38.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index39.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index40.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index41.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index42.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index43.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index44.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index45.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index46.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index47.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index48.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index49.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index50.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index51.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index52.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index53.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index54.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index55.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index56.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index57.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index58.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index59.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index60.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index61.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index62.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index63.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index64.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index65.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index66.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index67.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index68.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index69.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index70.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index71.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index72.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index73.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index74.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index75.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index76.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index77.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index78.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index79.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index80.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index81.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index82.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index83.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/index84.htm']"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现翻页的url队列\n",
    "url_xydt_group = ['https://www.nfu.edu.cn/xydt/index'+str(i)+'.htm' for i in range(1,85)]\n",
    "url_xydt_group.insert(0,'https://www.nfu.edu.cn/xydt/index.htm')\n",
    "url_xydt_group"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/xydt/index.htm'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_xydt_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_xydt_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(xydt_parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index35.htm', 'index36.htm', 'index37.htm', 'index38.htm', 'index39.htm', 'index4.htm', 'index40.htm', 'index41.htm', 'index42.htm', 'index43.htm', 'index44.htm', 'index45.htm', 'index46.htm', 'index47.htm', 'index48.htm', 'index49.htm', 'index5.htm', 'index50.htm', 'index51.htm', 'index52.htm', 'index53.htm', 'index54.htm', 'index55.htm', 'index56.htm', 'index57.htm', 'index58.htm', 'index59.htm', 'index6.htm', 'index60.htm', 'index61.htm', 'index62.htm', 'index63.htm', 'index64.htm', 'index65.htm', 'index66.htm', 'index67.htm', 'index68.htm', 'index69.htm', 'index7.htm', 'index70.htm', 'index71.htm', 'index72.htm', 'index73.htm', 'index74.htm', 'index75.htm', 'index76.htm', 'index77.htm', 'index78.htm', 'index79.htm', 'index8.htm', 'index80.htm', 'index81.htm', 'index82.htm', 'index83.htm', 'index84.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>【国奖映像】苏绮筠：让优秀成为习惯</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/7dfe6fcd15fd495597...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>护理与健康学院2020-2021年度第二学期3月份团支部委员会顺利举行</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/9ec16bf90e164071b6...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>【国奖映像】陈宇：心怀热爱，奔赴梦想</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/09627d3243ee4578ac...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>护理与健康学院2020-2021第二学期团员培训课程第2讲圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/debea203b0c84a3092...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>商学院电子商务专业召开申请调整学位授予学科门类 专家评审会</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/cf4420785b9046e998...</td>\n",
       "      <td>2021-04-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1663</th>\n",
       "      <td>3</td>\n",
       "      <td>广东技术师范学院大学英语部与我院大学英语教学中心教师交流会顺利举行</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/5cc461d4a37a4afb8d...</td>\n",
       "      <td>2016-01-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1664</th>\n",
       "      <td>4</td>\n",
       "      <td>经济学与商务管理系顺利召开办公培训会议</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/9b59863e5051412d80...</td>\n",
       "      <td>2016-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1665</th>\n",
       "      <td>5</td>\n",
       "      <td>严谨为学，诚信迎考--工商管理系班级期末总结暨诚信考试动员大会圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/3d02b255690e4ce796...</td>\n",
       "      <td>2016-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1666</th>\n",
       "      <td>6</td>\n",
       "      <td>以学生为本，做优秀学生干部——我院“青马工程”第二讲举行</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/0132532dbb0e448d82...</td>\n",
       "      <td>2015-10-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1667</th>\n",
       "      <td>7</td>\n",
       "      <td>经济学与商务管理系党总支第17期入党积极分子实践活动圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/2d20a911787b4cc09c...</td>\n",
       "      <td>2015-03-28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1688 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      index                                   标题  \\\n",
       "0         0                    【国奖映像】苏绮筠：让优秀成为习惯   \n",
       "3         3  护理与健康学院2020-2021年度第二学期3月份团支部委员会顺利举行   \n",
       "1         1                   【国奖映像】陈宇：心怀热爱，奔赴梦想   \n",
       "2         2    护理与健康学院2020-2021第二学期团员培训课程第2讲圆满结束   \n",
       "4         4        商学院电子商务专业召开申请调整学位授予学科门类 专家评审会   \n",
       "...     ...                                  ...   \n",
       "1663      3    广东技术师范学院大学英语部与我院大学英语教学中心教师交流会顺利举行   \n",
       "1664      4                  经济学与商务管理系顺利召开办公培训会议   \n",
       "1665      5  严谨为学，诚信迎考--工商管理系班级期末总结暨诚信考试动员大会圆满结束   \n",
       "1666      6         以学生为本，做优秀学生干部——我院“青马工程”第二讲举行   \n",
       "1667      7       经济学与商务管理系党总支第17期入党积极分子实践活动圆满结束   \n",
       "\n",
       "                                                     链接          日期  \n",
       "0     https://www.nfu.edu.cn/xydt/7dfe6fcd15fd495597...  2021-04-09  \n",
       "3     https://www.nfu.edu.cn/xydt/9ec16bf90e164071b6...  2021-04-09  \n",
       "1     https://www.nfu.edu.cn/xydt/09627d3243ee4578ac...  2021-04-09  \n",
       "2     https://www.nfu.edu.cn/xydt/debea203b0c84a3092...  2021-04-09  \n",
       "4     https://www.nfu.edu.cn/xydt/cf4420785b9046e998...  2021-04-07  \n",
       "...                                                 ...         ...  \n",
       "1663  https://www.nfu.edu.cn/xydt/5cc461d4a37a4afb8d...  2016-01-08  \n",
       "1664  https://www.nfu.edu.cn/xydt/9b59863e5051412d80...  2016-01-04  \n",
       "1665  https://www.nfu.edu.cn/xydt/3d02b255690e4ce796...  2016-01-04  \n",
       "1666  https://www.nfu.edu.cn/xydt/0132532dbb0e448d82...  2015-10-30  \n",
       "1667  https://www.nfu.edu.cn/xydt/2d20a911787b4cc09c...  2015-03-28  \n",
       "\n",
       "[1688 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/xydt/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/xydt/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_广州南方学院.xlsx',mode='a',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='校园动态')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通知公告"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### html页面数据存读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/tzgg/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_通知公告week5.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_通知公告week5.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "    html_load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### soup_html 解析 ： str的html文件 => element html元素文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x231048e6040>"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "tzgg_parsed = requests_html.soup_parse(html_load)\n",
    "tzgg_parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析、重组链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/tzgg/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/tzgg/cd60e06378e54492946c461d5f0574be.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/16fcbd56eab04220b33684acafc1ec66.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/155655d4a7e74c76958fa484c9cc9e24.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/f381db0e5b3e4746b310806a81edbaba.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/ae83ecc6ce894bcb812a17a8fb5fbd7b.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/47ba4c0dfb1443f9945ee3e6df61c921.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/21cd39d341924ffd93228d34c585dc15.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/66fc5e810c664b919bf192de1833c5d1.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/c63acd1ca67746ea93cec0ac93e621b6.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/6e824246669d471ca24b98dfb9538f41.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/df9afab7eb564161bc1a8198b364e79d.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/b57e28f6cd5a4cb7b1e67376809d99cb.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/ad7da9ff14494749b7334b768b2e7207.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/54573ec40397435ca49607cef867b18b.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/e28b2797a45d471b883388659c0d1eb3.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/fc277cccd7f440a2bdadcde84c707815.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/f04e18c7e60b437d8c4591851809edb4.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/4df100c9d8754356a625bd01f70a2d79.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/ed13952ecd4343d99de23d32ea968eb3.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/8eece1bbe3b34e489ebf8afdc9069e0f.htm']"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in tzgg_parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "35\n"
     ]
    }
   ],
   "source": [
    "# 有多少页？\n",
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/tzgg/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/tzgg/index.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index1.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index2.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index3.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index4.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index5.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index6.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index7.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index8.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index9.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index10.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index11.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index12.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index13.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index14.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index15.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index16.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index17.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index18.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index19.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index20.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index21.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index22.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index23.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index24.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index25.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index26.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index27.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index28.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index29.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index30.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index31.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index32.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index33.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index34.htm']"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现翻页的url队列\n",
    "url_tzgg_group = ['https://www.nfu.edu.cn/tzgg/index'+str(i)+'.htm' for i in range(1,35)]\n",
    "url_tzgg_group.insert(0,'https://www.nfu.edu.cn/tzgg/index.htm')\n",
    "url_tzgg_group"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/tzgg/index.htm'"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_tzgg_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_tzgg_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(tzgg_parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>校园管理部关于2021年元旦放假校园生活服务安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/cd60e06378e5449294...</td>\n",
       "      <td>2020-12-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>中山大学南方学院关于2021年元旦放假安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/16fcbd56eab04220b3...</td>\n",
       "      <td>2020-12-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>关于开展2020年知识产权竞赛的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/155655d4a7e74c7695...</td>\n",
       "      <td>2020-12-16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院关于举办2020年预防艾滋病巡讲活动的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/f381db0e5b3e4746b3...</td>\n",
       "      <td>2020-12-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>关于开展2020年安全知识竞赛的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/ae83ecc6ce894bcb81...</td>\n",
       "      <td>2020-12-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>2</td>\n",
       "      <td>“南苑青年”系列讲座之第十三讲的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/bbd14d55a99247a79f...</td>\n",
       "      <td>2015-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>563</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院关于举办“南方湖畔·艺彩纷呈”第七届校园文化艺术节活动通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/a8e5e752e409486da2...</td>\n",
       "      <td>2015-04-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>564</th>\n",
       "      <td>4</td>\n",
       "      <td>学院办公室关于2015年五一放假安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/e3f763049ee54cfc8c...</td>\n",
       "      <td>2015-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>5</td>\n",
       "      <td>中山大学南方学院关于2015年公共机构节能宣传作品征集活动的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/f3ae1aa3ccdb4d87bc...</td>\n",
       "      <td>2015-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>566</th>\n",
       "      <td>6</td>\n",
       "      <td>关于开展校园网络和运营商移动网络使用情况调查的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/6de44f6a618540ef82...</td>\n",
       "      <td>1970-01-01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>687 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                     标题  \\\n",
       "0        0            校园管理部关于2021年元旦放假校园生活服务安排的通知   \n",
       "1        1               中山大学南方学院关于2021年元旦放假安排的通知   \n",
       "2        2                     关于开展2020年知识产权竞赛的通知   \n",
       "3        3          中山大学南方学院关于举办2020年预防艾滋病巡讲活动的通知   \n",
       "4        4                     关于开展2020年安全知识竞赛的通知   \n",
       "..     ...                                    ...   \n",
       "562      2                     “南苑青年”系列讲座之第十三讲的通知   \n",
       "563      3  中山大学南方学院关于举办“南方湖畔·艺彩纷呈”第七届校园文化艺术节活动通知   \n",
       "564      4                  学院办公室关于2015年五一放假安排的通知   \n",
       "565      5       中山大学南方学院关于2015年公共机构节能宣传作品征集活动的通知   \n",
       "566      6              关于开展校园网络和运营商移动网络使用情况调查的通知   \n",
       "\n",
       "                                                    链接          日期  \n",
       "0    https://www.nfu.edu.cn/tzgg/cd60e06378e5449294...  2020-12-25  \n",
       "1    https://www.nfu.edu.cn/tzgg/16fcbd56eab04220b3...  2020-12-17  \n",
       "2    https://www.nfu.edu.cn/tzgg/155655d4a7e74c7695...  2020-12-16  \n",
       "3    https://www.nfu.edu.cn/tzgg/f381db0e5b3e4746b3...  2020-12-03  \n",
       "4    https://www.nfu.edu.cn/tzgg/ae83ecc6ce894bcb81...  2020-12-03  \n",
       "..                                                 ...         ...  \n",
       "562  https://www.nfu.edu.cn/tzgg/bbd14d55a99247a79f...  2015-04-08  \n",
       "563  https://www.nfu.edu.cn/tzgg/a8e5e752e409486da2...  2015-04-07  \n",
       "564  https://www.nfu.edu.cn/tzgg/e3f763049ee54cfc8c...  2015-04-01  \n",
       "565  https://www.nfu.edu.cn/tzgg/f3ae1aa3ccdb4d87bc...  2015-04-01  \n",
       "566  https://www.nfu.edu.cn/tzgg/6de44f6a618540ef82...  1970-01-01  \n",
       "\n",
       "[687 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/tzgg/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/tzgg/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_广州南方学院.xlsx',mode='a',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='通知公告')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 招投标"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### html页面数据存读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/ztb/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_招投标week5.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_招投标week5.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "    html_load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### soup_html 解析 ： str的html文件 => element html元素文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x23104613310>"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "ztb_parsed = requests_html.soup_parse(html_load)\n",
    "ztb_parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析、重组链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/ztb/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837fa4325389300f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ea8754261f26419080ae1933f5ae7f2a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b972599c1c947ffe.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1096effc050fe1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/60c660848ef44283bcae0b864f06245b.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/c1f45c4ed6d24523b8015716f1354c69.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/8de22fa69c5a4718a5d3a234157a231a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/84df006147494c74a06300e42ba5fe0f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/5b6a96bc894e4901b9015550f495d48c.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/cee6034ea34b4d37af132488e5b08eba.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/2b0efb94d7bc43a69cf7705d8e5eb3fb.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/4ca38f35a904483aa17f8149b6e74a5f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/d9a43543bfc04b249605e362c4d56fde.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/a94be158ee2d45629fa34fe777c524aa.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/6c02c38297c94f82a0b00e0c465ecf42.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/f312609072284e918844133b2e8d17de.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/711839de4a50406da99b621c5c60f53a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ef39ea1df91b4208859c3d139614b752.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/303ff597654847ad9fe7ec28cee402b3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/3c3b1ca74f0b47e1a6f9a6e2434014aa.htm']"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in ztb_parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22\n"
     ]
    }
   ],
   "source": [
    "# 有多少页？\n",
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/index.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index4.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index5.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index6.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index7.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index9.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index10.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index11.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index12.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index13.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index14.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index15.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index16.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index17.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index18.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index19.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index20.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index21.htm']"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现翻页的url队列\n",
    "url_ztb_group = ['https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm' for i in range(1,22)]\n",
    "url_ztb_group.insert(0,'https://www.nfu.edu.cn/ztb/index.htm')\n",
    "url_ztb_group"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/ztb/index.htm'"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_ztb_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_ztb_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(ztb_parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ea8754261f26419080a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院垃圾清运和处理服务项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...</td>\n",
       "      <td>2021-03-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目中标结果公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/60c660848ef44283bca...</td>\n",
       "      <td>2021-03-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>289</th>\n",
       "      <td>9</td>\n",
       "      <td>中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...</td>\n",
       "      <td>2015-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>290</th>\n",
       "      <td>10</td>\n",
       "      <td>中山大学南方学院计算机实验室设备采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>291</th>\n",
       "      <td>11</td>\n",
       "      <td>中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>292</th>\n",
       "      <td>12</td>\n",
       "      <td>中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...</td>\n",
       "      <td>2015-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>293</th>\n",
       "      <td>13</td>\n",
       "      <td>中山大学南方学院室内高尔夫模拟设备项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...</td>\n",
       "      <td>2013-12-23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>434 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                      标题  \\\n",
       "0        0     广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告   \n",
       "1        1        广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告   \n",
       "2        2  中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告   \n",
       "3        3                 中山大学南方学院垃圾清运和处理服务项目招标公告   \n",
       "4        4         中山大学南方学院2021年度维修、改造工程施工项目中标结果公示   \n",
       "..     ...                                     ...   \n",
       "289      9          中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）   \n",
       "290     10                中山大学南方学院计算机实验室设备采购项目中标公示   \n",
       "291     11      中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）   \n",
       "292     12               中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示   \n",
       "293     13                 中山大学南方学院室内高尔夫模拟设备项目招标公告   \n",
       "\n",
       "                                                    链接          日期  \n",
       "0    https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...  2021-04-08  \n",
       "1    https://www.nfu.edu.cn/ztb/ea8754261f26419080a...  2021-04-02  \n",
       "2    https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...  2021-03-31  \n",
       "3    https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...  2021-03-17  \n",
       "4    https://www.nfu.edu.cn/ztb/60c660848ef44283bca...  2021-03-11  \n",
       "..                                                 ...         ...  \n",
       "289  https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...  2015-03-27  \n",
       "290  https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...  2015-03-26  \n",
       "291  https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...  2015-03-26  \n",
       "292  https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...  2015-03-20  \n",
       "293  https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...  2013-12-23  \n",
       "\n",
       "[434 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/ztb/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/ztb/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_广州南方学院.xlsx',mode='a',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='招投标')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 高教动态"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### html页面数据存读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/gjdt/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_高教动态week5.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_高教动态week5.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()\n",
    "    html_load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### soup_html 解析 ： str的html文件 => element html元素文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x23105b579f0>"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "gjdt_parsed = requests_html.soup_parse(html_load)\n",
    "gjdt_parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析、重组链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/gjdt/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/309be8b078444044b51624f0e186729e.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba7cbbc80e65b871.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88bcebb750c5dcc33.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/20dc120c250642cca5815c93591bb5cb.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/b43531427fb44695bbb0e16280988965.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/1509f4f3bc2f4babbe57c7ec29854807.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a4b2fb3dacae4564976e9a951bcddcff.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0e7d664c116f4a0ab9e6d165977f9def.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/71e152ddce12414388346126ba1a1b6b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/0d7bfc95f70841a6b5c2ad85f74c3510.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/750c396e278446e687a42965b1c9a385.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/cd3ecf8986ad40e991e37f635f1ab8b4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3a7835bded2441aeb67bbc6f2a61471c.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/4d0eb3a8b8ee47f6b9bf0e9fda26fa4a.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/355e3d6207974a3ea62ef78d2ecc2f23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/9b6486d83f454a2ca7a0267169ca534d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/6bb7172f46b3458b8022b1132593b96b.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/a8f6f2c6a2c644d2998d7836f8151f58.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/f978598b61024bb2982f5fbb32f81b9d.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/3d9e9dfac15945938692c88167347ee7.htm']"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in gjdt_parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26\n"
     ]
    }
   ],
   "source": [
    "# 有多少页？\n",
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/gjdt/index.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index1.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index2.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index3.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index4.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index5.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index6.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index7.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index8.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index9.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index10.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index11.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index12.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index13.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index14.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index15.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index16.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index17.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index18.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index19.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index20.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index21.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index22.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index23.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index24.htm',\n",
       " 'https://www.nfu.edu.cn/gjdt/index25.htm']"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 实现翻页的url队列\n",
    "url_gjdt_group = ['https://www.nfu.edu.cn/gjdt/index'+str(i)+'.htm' for i in range(1,26)]\n",
    "url_gjdt_group.insert(0,'https://www.nfu.edu.cn/gjdt/index.htm')\n",
    "url_gjdt_group"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存html文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/gjdt/index.htm'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_gjdt_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_gjdt_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 批量存excel文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(gjdt_parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/309be8b078444044b5...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...</td>\n",
       "      <td>2021-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...</td>\n",
       "      <td>2021-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...</td>\n",
       "      <td>2020-12-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>8</td>\n",
       "      <td>广东省教育厅：今年毕业生就业形势比去年好</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...</td>\n",
       "      <td>2014-03-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>9</td>\n",
       "      <td>要求职业“高大上” 高校毕业生择业扎堆致就业难</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...</td>\n",
       "      <td>2014-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>370</th>\n",
       "      <td>10</td>\n",
       "      <td>教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...</td>\n",
       "      <td>2014-03-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>371</th>\n",
       "      <td>11</td>\n",
       "      <td>学位论文如何才能挤出“水分”</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...</td>\n",
       "      <td>2014-03-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>372</th>\n",
       "      <td>12</td>\n",
       "      <td>高校低年级学生频繁试水招聘会 专家：鼓励提前预热</td>\n",
       "      <td>https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...</td>\n",
       "      <td>2013-03-31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>513 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                   标题  \\\n",
       "0        0      教育部党组《求是》撰文：精心谋划 切实抓好教育系统党史学习教育   \n",
       "1        1       教育部长陈宝生：把巩固拓展作为开局之年工作主题，做到6个到位   \n",
       "2        2   如何建设高质量教育体系？“十四五”规划和2035年远景目标纲要明确了   \n",
       "3        3     教育部长陈宝生《旗帜》撰文：建设高质量教育体系，加快建成教育强国   \n",
       "4        4          重磅！《推进粤港澳大湾区高等教育合作发展规划》正式印发   \n",
       "..     ...                                  ...   \n",
       "368      8                 广东省教育厅：今年毕业生就业形势比去年好   \n",
       "369      9              要求职业“高大上” 高校毕业生择业扎堆致就业难   \n",
       "370     10  教育部：预计今年贫困地区农村学生上重点高校的人数将比去年增加10%以上   \n",
       "371     11                       学位论文如何才能挤出“水分”   \n",
       "372     12             高校低年级学生频繁试水招聘会 专家：鼓励提前预热   \n",
       "\n",
       "                                                    链接          日期  \n",
       "0    https://www.nfu.edu.cn/gjdt/309be8b078444044b5...  2021-04-08  \n",
       "1    https://www.nfu.edu.cn/gjdt/159b20971f8b4051ba...  2021-03-20  \n",
       "2    https://www.nfu.edu.cn/gjdt/27ba495edc1b49f88b...  2021-03-15  \n",
       "3    https://www.nfu.edu.cn/gjdt/20dc120c250642cca5...  2021-01-05  \n",
       "4    https://www.nfu.edu.cn/gjdt/b43531427fb44695bb...  2020-12-22  \n",
       "..                                                 ...         ...  \n",
       "368  https://www.nfu.edu.cn/gjdt/3829e4c5df9e460abc...  2014-03-28  \n",
       "369  https://www.nfu.edu.cn/gjdt/776ebc41fae84b36a4...  2014-03-27  \n",
       "370  https://www.nfu.edu.cn/gjdt/41d339ccb3a0464c9c...  2014-03-25  \n",
       "371  https://www.nfu.edu.cn/gjdt/1e8fa309bcf847b6ad...  2014-03-24  \n",
       "372  https://www.nfu.edu.cn/gjdt/3f34245a7cb449c99b...  2013-03-31  \n",
       "\n",
       "[513 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/gjdt/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/gjdt/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_广州南方学院.xlsx',mode='a',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='高教动态')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
